Load packages

If you are using the package for the first time, you will have to first install it.

# install.packages("survival") 
library(survival)

Get data

pbc <- survival::pbc

Smart programming

  • Keep good notes!

Code 1:

x<-10
y<-10/2
z<-x+y

Code 2:

# Store a value
x <- 10
# Take half of the stored value
y <- x/2
# Get the sum of both values
z <- x + y

Which code (1 or 2) would you prefer?

  • Always check whether you can make your code faster.

Create a matrix:

A <- matrix(data = rnorm(n = 1e06), nrow = 1000, ncol = 1000)

Calculate the sum per row:

system.time( apply(A, 1, sum) )
##    user  system elapsed 
##    0.03    0.00    0.03
system.time( rowSums(A) )
##    user  system elapsed 
##       0       0       0

Calculate the sum per row, repeat this procedure 100 times:

res1 <- replicate(n = 100, expr = apply(A, 1, sum))
res2 <- replicate(n = 100, expr = rowSums(A)) # use specialized functions

Compute the cumulative sum of a numeric vector. (cumulative sum of 1, 2, 3, 4 is 1, 3, 6, 10) First we create a numerical vector:

x <- rnorm(n = 1e06, mean = 10, sd = 10)

# The cumulative sum in the beginning is 0:
cSum <- 0 
# Create an empty vector:
z <- numeric() 
for (k in 1:length(x)){
  cSum <- cSum + x[k]
  z[k] <- cSum
}

Explore how it works:

k <- 1
cSum <- 0
cSum <- cSum + x[k]

k <- 2
cSum <- cSum + x[k]

Use specialized functions:

cSum <- cumsum(x)

Explore the timing of the code.

cSum <- 0 
z <- numeric() 
system.time({
  cSum <- 0
  for (i in 1:length(x)){
    cSum <- cSum + x[i]
    z[i] <- cSum
  }
})
##    user  system elapsed 
##    0.23    0.02    0.25
# better
system.time( cumsum(x) ) 
##    user  system elapsed 
##       0       0       0

More than one ways exist to do something in R!

Functions / loop / if statement

Calculate the mean weight for males and females in 100 datasets.
Since we do not have 100 data sets, we will create them!
Let’s do that first manually…

datlist <- list()

i <- 1
set.seed(123+i)
patient <- c(1:20)
weight <- rnorm(20, 70, 10)
sex <- sample(1:2, 20, replace = TRUE)
sex <- factor(sex, levels = 1:2, labels = c("male", "female"))

datlist[[i]] <- data.frame(patient, weight, sex)

i <- 2
set.seed(123+i)
patient <- c(1:20)
weight <- rnorm(20, 70, 10)
sex <- sample(1:2, 20, replace = T)
sex <- factor(sex, levels = 1:2, labels = c("male", "female"))

datlist[[i]] <- data.frame(patient, weight, sex)
# ..... 

It is too much work! Now use a loop:

for (i in 1:100) {
  set.seed(123+i)
  patient <- c(1:20)
  weight <- rnorm(20, 70, 10)
  sex <- sample(1:2, 20, replace = T)
  sex <- factor(sex, levels = 1:2, labels = c("male", "female"))
  
  datlist[[i]] <- data.frame(patient, weight, sex)
}

Now that we have 100 data sets we need to calculate the mean weight per gender in each of them.
Let’s do that manually…

means <- matrix(data = NA, nrow = length(datlist), ncol = 2)

i <- 1
dat <- datlist[[i]]
means[i, ] <- tapply(dat$weight, dat$sex, mean)

i <- 2
dat <- datlist[[i]]
means[i, ] <- tapply(dat$weight, dat$sex, mean)
# .....

Now use a loop:

for (i in 1:length(datlist)) {
  dat <- datlist[[i]]
  means[i, ] <- tapply(dat$weight, dat$sex, mean)
}

means
##            [,1]     [,2]
##   [1,] 72.67093 65.35979
##   [2,] 73.58761 70.54592
##   [3,] 67.70248 71.18005
##   [4,] 68.26480 71.41099
##   [5,] 70.32488 73.52906
##   [6,] 75.88853 65.13093
##   [7,] 70.95698 68.43297
##   [8,] 69.47857 68.85621
##   [9,] 67.04953 75.86609
##  [10,] 67.70928 74.10217
##  [11,] 69.48756 71.42291
##  [12,] 67.88684 72.51510
##  [13,] 67.49031 70.37902
##  [14,] 74.62734 72.50830
##  [15,] 70.35835 65.61337
##  [16,] 69.43763 70.10850
##  [17,] 76.18847 76.72335
##  [18,] 69.22957 70.67279
##  [19,] 70.32851 75.38167
##  [20,] 74.26989 67.86045
##  [21,] 66.16984 64.05326
##  [22,] 73.74897 76.30083
##  [23,] 67.17520 69.11970
##  [24,] 70.61564 68.93750
##  [25,] 74.76173 70.34485
##  [26,] 66.01594 63.84540
##  [27,] 66.13668 69.79401
##  [28,] 69.51881 69.14171
##  [29,] 67.92588 70.58051
##  [30,] 65.45927 70.48295
##  [31,] 69.70336 72.30258
##  [32,] 69.22542 71.23817
##  [33,] 73.09973 66.59665
##  [34,] 72.31334 78.05853
##  [35,] 63.94101 67.19878
##  [36,] 64.36021 67.98514
##  [37,] 72.59174 69.79940
##  [38,] 72.38843 75.25830
##  [39,] 65.96138 69.01231
##  [40,] 67.26918 66.50526
##  [41,] 72.93189 69.08579
##  [42,] 70.82892 69.04613
##  [43,] 73.95355 70.65879
##  [44,] 66.95142 66.91990
##  [45,] 71.88453 71.12322
##  [46,] 71.24994 72.46953
##  [47,] 75.84781 72.89258
##  [48,] 72.13181 69.25334
##  [49,] 72.83673 70.41300
##  [50,] 73.42010 77.15810
##  [51,] 71.30431 75.75554
##  [52,] 75.05188 72.56145
##  [53,] 71.41672 71.90575
##  [54,] 73.38978 72.47846
##  [55,] 73.58065 75.76330
##  [56,] 69.32783 76.11282
##  [57,] 69.10112 73.41443
##  [58,] 68.64874 69.88914
##  [59,] 71.54627 73.45020
##  [60,] 68.29513 72.58044
##  [61,] 69.86955 70.82768
##  [62,] 81.05653 67.28846
##  [63,] 66.04835 71.89415
##  [64,] 72.67207 66.32415
##  [65,] 74.50970 64.55474
##  [66,] 69.63641 72.14194
##  [67,] 71.15808 73.00673
##  [68,] 71.27653 75.53417
##  [69,] 72.33656 69.00526
##  [70,] 72.51226 69.50037
##  [71,] 68.70055 69.83255
##  [72,] 65.10849 72.32741
##  [73,] 68.65566 71.51932
##  [74,] 74.09553 74.52641
##  [75,] 70.16617 70.32832
##  [76,] 74.44624 66.83484
##  [77,] 70.87837 68.30180
##  [78,] 65.65148 70.65974
##  [79,] 71.55960 67.12177
##  [80,] 71.62386 69.18857
##  [81,] 63.99797 68.16817
##  [82,] 72.96528 72.09290
##  [83,] 68.39122 71.72255
##  [84,] 66.43081 67.23526
##  [85,] 72.02173 69.11552
##  [86,] 76.02071 66.53986
##  [87,] 72.99379 71.37415
##  [88,] 69.58583 68.32469
##  [89,] 72.60383 65.49518
##  [90,] 72.97765 63.53318
##  [91,] 64.82462 70.53395
##  [92,] 74.47472 69.85521
##  [93,] 73.78986 78.60479
##  [94,] 68.86772 68.62155
##  [95,] 69.59406 69.86890
##  [96,] 70.38825 67.42225
##  [97,] 64.07105 71.27923
##  [98,] 64.77403 70.14883
##  [99,] 71.48067 66.56438
## [100,] 73.85559 65.00337

Select the data sets, where more than 39% of the patients are females:

# We will store the data sets in a list called newList:
newList <- list()

# Using a for loop and an if statement we can go through the data sets and
# check the percentages of females:
for (i in 1:length(datlist)) {
  dat <- datlist[[i]]
  if (sum(dat$sex == "female")/20 > 0.39) {
    newList[[i]] <- dat
  }
}

Now we need to remove the elements of the list that are NULL:

newList <- newList[!sapply(newList, is.null)]
length(newList)
## [1] 91

Create a function that takes as input the data sets in a list format, the name of the sex variable and the name of the female category.
This function returns only the data sets, where more than 39% of the patients are females.
The output should be a list.

subset_data <- function(dataset = x, sex_var = "sex", female_cat = "female"){
  newList <- list()
  for (i in 1:length(dataset)) {
    dat <- dataset[[i]]
    if (sum(dat[sex_var] == female_cat)/20 > 0.39) {
      newList[[i]] <- dat
    }
  }
  newList <- newList[!sapply(newList, is.null)]
}

res <- subset_data(dataset = datlist, sex_var = "sex", female_cat = "female")

length(res)
## [1] 91

Make a function that takes as input a data set, the name of a continuous variable and the name of a categorical variable. This function calculates the mean and standard deviation of the continuous variable for each group in the categorical variable.

des <- function(data = x, cont = "age", cat = "group"){
  c(tapply(data[[cont]], data[[cat]], mean),
    tapply(data[[cont]], data[[cat]], sd))
}

Apply the function to the pbc data set. Use age and sex as continuous and categorical variables.

des(data = pbc, cont = "age", cat = "sex")
##        m        f        m        f 
## 55.71072 50.15694 10.97780 10.24065

Why do we use [[]] and not []?

data = pbc
cont = "age"
cat = "sex"
data[[cont]] # this is in a vector format
##   [1] 58.76523 56.44627 70.07255 54.74059 38.10541 66.25873 55.53457 53.05681
##   [9] 42.50787 70.55989 53.71389 59.13758 45.68925 56.22177 64.64613 40.44353
##  [17] 52.18344 53.93018 49.56057 59.95346 64.18891 56.27652 55.96715 44.52019
##  [25] 45.07324 52.02464 54.43943 44.94730 63.87680 41.38535 41.55236 53.99589
##  [33] 51.28268 52.06023 48.61875 56.41068 61.72758 36.62697 55.39220 46.66940
##  [41] 33.63450 33.69473 48.87064 37.58248 41.79329 45.79877 47.42779 49.13621
##  [49] 61.15264 53.50856 52.08761 50.54073 67.40862 39.19781 65.76318 33.61807
##  [57] 53.57153 44.56947 40.39425 58.38193 43.89870 60.70637 46.62834 62.90760
##  [65] 40.20260 46.45311 51.28816 32.61328 49.33881 56.39973 48.84600 32.49281
##  [73] 38.49418 51.92060 43.51814 51.94251 49.82615 47.94524 46.51608 67.41136
##  [81] 63.26352 67.31006 56.01369 55.83025 47.21697 52.75838 37.27858 41.39357
##  [89] 52.44353 33.47570 45.60712 76.70910 36.53388 53.91650 46.39014 48.84600
##  [97] 71.89322 28.88433 48.46817 51.46886 44.95003 56.56947 48.96372 43.01711
## [105] 34.03970 68.50924 62.52156 50.35729 44.06297 38.91034 41.15264 55.45791
## [113] 51.23340 52.82683 42.63929 61.07050 49.65640 48.85421 54.25599 35.15127
## [121] 67.90691 55.43600 45.82067 52.88980 47.18138 53.59890 44.10404 41.94935
## [129] 63.61396 44.22724 62.00137 40.55305 62.64476 42.33539 42.96783 55.96167
## [137] 62.86105 51.24983 46.76249 54.07529 47.03628 55.72621 46.10267 52.28747
## [145] 51.20055 33.86448 75.01164 30.86379 61.80424 34.98700 55.04175 69.94114
## [153] 49.60438 69.37714 43.55647 59.40862 48.75838 36.49281 45.76044 57.37166
## [161] 42.74333 58.81725 53.49760 43.41410 53.30595 41.35524 60.95825 47.75359
## [169] 35.49076 48.66256 52.66804 49.86995 30.27515 55.56742 52.15332 41.60986
## [177] 55.45243 70.00411 43.94251 42.56810 44.56947 56.94456 40.26010 37.60712
## [185] 48.36140 70.83641 35.79192 62.62286 50.64750 54.52704 52.69268 52.72005
## [193] 56.77207 44.39699 29.55510 57.04038 44.62697 35.79740 40.71732 32.23272
## [201] 41.09240 61.63997 37.05681 62.57906 48.97741 61.99042 72.77207 61.29500
## [209] 52.62423 49.76318 52.91444 47.26352 50.20397 69.34702 41.16906 59.16496
## [217] 36.07940 34.59548 42.71321 63.63039 56.62971 46.26420 61.24298 38.62012
## [225] 38.77070 56.69541 58.95140 36.92266 62.41478 34.60917 58.33539 50.18207
## [233] 42.68583 34.37919 33.18275 38.38193 59.76181 66.41205 46.78987 56.07940
## [241] 41.37440 64.57221 67.48802 44.82957 45.77139 32.95003 41.22108 55.41684
## [249] 47.98084 40.79124 56.97467 68.46270 78.43943 39.85763 35.31006 31.44422
## [257] 58.26420 51.48802 59.96988 74.52430 52.36413 42.78713 34.87474 44.13963
## [265] 46.38193 56.30938 70.90760 55.39493 45.08419 26.27789 50.47228 38.39836
## [273] 47.41958 47.98084 38.31622 50.10815 35.08830 32.50376 56.15332 46.15469
## [281] 65.88364 33.94387 62.86105 48.56400 46.34908 38.85284 58.64750 48.93634
## [289] 67.57290 65.98494 40.90075 50.24504 57.19644 60.53662 35.35113 31.38125
## [297] 55.98631 52.72553 38.09172 58.17112 45.21013 37.79877 60.65982 35.53457
## [305] 43.06639 56.39151 30.57358 61.18275 58.29979 62.33265 37.99863 33.15264
## [313] 60.00000 64.99932 54.00137 75.00068 62.00137 43.00068 46.00137 44.00000
## [321] 60.99932 64.00000 40.00000 63.00068 34.00137 52.00000 48.99932 54.00137
## [329] 63.00068 54.00137 46.00137 52.99932 56.00000 56.00000 55.00068 64.99932
## [337] 56.00000 47.00068 60.00000 52.99932 54.00137 50.00137 48.00000 36.00000
## [345] 48.00000 70.00137 51.00068 52.00000 54.00137 48.00000 66.00137 52.99932
## [353] 62.00137 59.00068 39.00068 67.00068 58.00137 64.00000 46.00137 64.00000
## [361] 40.99932 48.99932 44.00000 59.00068 63.00068 60.99932 64.00000 48.99932
## [369] 42.00137 50.00137 51.00068 36.99932 62.00137 51.00068 52.00000 44.00000
## [377] 32.99932 60.00000 63.00068 32.99932 40.99932 51.00068 36.99932 59.00068
## [385] 55.00068 54.00137 48.99932 40.00000 67.00068 68.00000 40.99932 68.99932
## [393] 52.00000 56.99932 36.00000 50.00137 64.00000 62.00137 42.00137 44.00000
## [401] 68.99932 52.00000 66.00137 40.00000 52.00000 46.00137 54.00137 51.00068
## [409] 43.00068 39.00068 51.00068 67.00068 35.00068 67.00068 39.00068 56.99932
## [417] 58.00137 52.99932
data[cont] # this is in a matrix format
# The tapply works with vector format data.

Now change the des function so that the user would specify the function applied to the data set.

des <- function(data = x, cont = "age", cat = "group", fun = mean){
  tapply(data[[cont]], data[[cat]], fun)
}

des(data = pbc, cont = "age", cat = "sex", fun = function(x) { sum(x) + 1 } )
##         m         f 
##  2452.272 18759.697